#-------------------------------------------------------#
#   Quality Assessment of the Academic Freedom Index    #
#-------------------------------------------------------#

# Title: "Quality Assessment of the Academic Freedom Index: Strengths, Weaknesses, and How Best to Use It" 
# Authors: "Lott, Lars", "Spannagel, Janika"
# date: 2024-09-19
# journal: Perspectives on Politics
# DOI: TBA
# written under "R and RStudio (4.4.1)"

#### Preliminaries ####

R.version$version.string

# clear workspace
rm(list=ls())

# load packages
library(here)
library(tidyverse)
library(magrittr)
library(reshape2)
library(ggpubr)
library(countrycode)
library(ggrepel)
library(ggthemes)
library(lubridate)
library(estimatr)
library(texreg)
library(readxl)
library(haven)
library(scales)
library(fixest)
library(marginaleffects)

set.seed(1234)

#---------------------------------------------------#
#--------------------load data ---------------------#
#---------------------------------------------------#

# load country_aggregated data
vdem_full_v13 <- readRDS(file.path("data", "vdem_v13", "vdem_13_cy", "V-Dem-CY-Full+Others-v13.rds"))

# Construct a baseline and small cy-dataset #
basic_df <- vdem_full_v13 %>%
  dplyr::select(country_text_id, year, country_name, e_polity2, e_pop, e_democracy_trans, e_democ, 
                starts_with("e_gdppc"), starts_with("e_pop"), starts_with("v2x_polyarchy"),
                starts_with("v2xca_academ"), starts_with("v2x_jucon"), starts_with("v2xlg_legcon"), e_regionpol, 
                e_regionpol_6C)  %>%
  mutate(dem_transition = ifelse(e_democracy_trans==1, 1, 0))

summary(basic_df)


#---------------------------------------------------#
#--------------------Load ERT data------------------#
#---------------------------------------------------#

# start inclusion: 0.05 
# cum inclusion: 0.1
# year turn: 0.03
# cum turn: 0.1
# tolerance: 5 years

ert_data <-  read.csv("data/vdem_v13/ERT-13/inst/ert.csv")

ert_data <- ert_data %>%
  group_by(dem_ep_id) %>%
  mutate(last_v2x_polyarchy = last(v2x_polyarchy), 
         first_v2x_polyarchy = first(v2x_polyarchy)) %>%
  ungroup() %>%
  mutate(v2x_polyarchy_increase = last_v2x_polyarchy - first_v2x_polyarchy)

ert_data <- ert_data %>%
  group_by(aut_ep_id) %>%
  mutate(last_v2x_polyarchy = last(v2x_polyarchy), 
         first_v2x_polyarchy = first(v2x_polyarchy)) %>%
  ungroup() %>%
  mutate(v2x_polyarchy_decrease = last_v2x_polyarchy - first_v2x_polyarchy) %>%
  dplyr::select(-c(last_v2x_polyarchy, first_v2x_polyarchy))

ert_data <- ert_data %>%
  mutate(v2x_polyarchy_increase = ifelse(is.na(dem_ep_id), NA,v2x_polyarchy_increase )) %>%
  mutate(v2x_polyarchy_decrease = ifelse(is.na(aut_ep_id), NA,v2x_polyarchy_decrease ))

ert_data <- ert_data %>%
  dplyr::select(-c(X, country_id, country_name, v2x_regime:v2x_polyarchy_codehigh))

basic_df <- basic_df %>%
  left_join(ert_data, by =c("country_text_id", "year"))

# Taking draws
func_draws <- function(df) {
  df_draws <- list()
  iter <- cbind(rep(1, nrow(df)))
  for(i in 1:1000){
    df_draws[[i]] <- df
    df_draws[[i]]$v2xca_academ_draw <- rnorm(iter, 
                                             mean = df$v2xca_academ, 
                                             sd = df$v2xca_academ_sd)
    df_draws[[i]]$v2x_polyarchy_draw <- rnorm(iter, 
                                              mean = df$v2x_polyarchy, 
                                              sd = df$v2x_polyarchy_sd)
    df_draws[[i]]$e_gdppc_draw <- rnorm(iter, 
                                      mean = df$e_gdppc, 
                                      sd = df$e_gdppc_sd)
    df_draws[[i]]$e_pop_draw <- rnorm(iter, 
                                      mean = df$e_pop, 
                                      sd = df$e_pop_sd)
    df_draws[[i]]$v2x_jucon_draw <- rnorm(iter, 
                                      mean = df$v2x_jucon, 
                                      sd = df$v2x_jucon_sd)
    df_draws[[i]]$v2xlg_legcon_draw <- rnorm(iter, 
                                      mean = df$v2xlg_legcon, 
                                      sd = df$v2xlg_legcon_sd)
    
  }
  df_draws
}

newdata <- func_draws(basic_df)
gc()

# Mutate Dataframes ##

mutate_function <- function(df) {
  df %>%
    group_by(e_regionpol, year) %>%
    mutate(regionpol_EDI = mean(v2x_polyarchy, na.rm = TRUE), 
           regionpol_EDI_draw = mean(v2x_polyarchy_draw, na.rm = TRUE)) %>%
    ungroup() %>%
    group_by(country_text_id) %>%
    mutate(e_gdppc_growth = ((e_gdppc - dplyr::lag(e_gdppc))/dplyr::lag(e_gdppc))*100) %>%
    mutate(e_gdppc_growth_roll5 = zoo::rollmean(e_gdppc_growth, k = 5, fill = NA),) %>%
    mutate(e_gdppc_growth_draw = ((e_gdppc_draw - dplyr::lag(e_gdppc_draw))/dplyr::lag(e_gdppc_draw))*100) %>%
    mutate(e_gdppc_growth_roll5_draw = zoo::rollmean(e_gdppc_growth_draw, k = 5, fill = NA)) %>%
    ungroup() %>%
    mutate(region_factor = as.factor(e_regionpol_6C),
           region_factor = relevel(region_factor, ref=1), 
           year_1900 =  year -1900,
           year_sq =year_1900^2)
  
}

newdata <- map(newdata, mutate_function)
gc()

## Add Lags ##
lag_function_variables <- function(df) {
  df %>%
    group_by(country_text_id) %>%
    mutate(v2xca_academ_draw_lag = lag(v2xca_academ_draw), 
           e_gdppc_draw_lag = lag(e_gdppc_draw), 
           e_pop_draw_lag = lag(e_pop_draw), 
           e_gdppc_growth_roll5_draw_lag = lag(e_gdppc_growth_roll5_draw), 
           regionpol_EDI_draw_lag = lag(regionpol_EDI_draw), 
           v2x_jucon_draw_lag = lag(v2x_jucon_draw), 
           v2xlg_legcon_draw_lag = lag(v2xlg_legcon_draw), 
           e_gdppc_lag = lag(e_gdppc), 
           e_pop_lag = lag(e_pop), 
           e_gdppc_growth_roll5_lag = lag(e_gdppc_growth_roll5), 
           regionpol_EDI_lag = lag(regionpol_EDI), 
           v2x_jucon_lag = lag(v2x_jucon), 
           v2xlg_legcon_lag = lag(v2xlg_legcon), 
    )
}

newdata <- map(newdata, lag_function_variables)
gc()

## Drop NAs ##

drop_na_function <- function(df) {
  df %>%
    drop_na(v2xca_academ_draw_lag, e_gdppc_lag, e_gdppc_growth_roll5_lag, regionpol_EDI_lag, 
            e_pop_lag,region_factor, v2x_jucon_lag, v2xlg_legcon_lag, year_1900, country_text_id) 
}

newdata <- map(newdata, drop_na_function)

## Drop Multi-year observations in a single episode ##

drop_multiple_auto_years <- function(df) {
  df %>%
    group_by(country_text_id) %>%
    mutate(aut_ep_onset = ifelse(aut_ep==1 & lag(aut_ep)== 1, NA, aut_ep)) %>%
    dplyr::select(country_text_id, year, aut_ep, aut_ep_onset, everything()) %>%
    drop_na(aut_ep_onset)
}

newdata <- map(newdata, drop_multiple_auto_years)

gc()
rm(ert_data,  vdem_full_v13)

## Step 4 Structure data with purr and map ##

all_imputations <- bind_rows(unclass(newdata), .id = "m") %>%
  group_by(m) %>%
  nest()

rm(newdata)

saveRDS(all_imputations, "data/all_imputations.rds")

all_imputations<- readRDS("data/all_imputations.rds")


## Step 5: run models with map and and built tidy summaries of those models directly in a data frame 
## Only main explanatory variable with measurement uncertainty 
start.time <- Sys.time()
model_m2  <- all_imputations %>%
  mutate(model = data %>% map(~ feglm(aut_ep_onset ~ poly(v2xca_academ_draw_lag, degree = 2) + 
                                        e_gdppc_lag + e_gdppc_growth_roll5_lag +  e_pop_lag +
                                        regionpol_EDI_lag + v2x_jucon_lag + v2xlg_legcon_lag + 
                                        year_1900 + year_sq | region_factor,  cluster = ~country_text_id, 
                                      family = binomial(link = "probit"), data =.)), 
         tidied = model %>% map(~ tidy(., conf.int = TRUE)),
         glance = model %>% map(~ glance(.)))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken

## Step 6: run models with map and return tidy summaries of those models directly in a data frame
## All IVs with measurement uncertainty 
start.time <- Sys.time()
model_m3  <- all_imputations %>%
  mutate(model = data %>% map(~ feglm(aut_ep_onset ~  poly(v2xca_academ_draw_lag, degree = 2) + 
                                e_gdppc_draw_lag + e_gdppc_growth_roll5_draw_lag +  e_pop_draw_lag +
                                regionpol_EDI_draw_lag + v2x_jucon_draw_lag + v2xlg_legcon_draw_lag +
                                year_1900 + year_sq | region_factor,  cluster = ~country_text_id, 
                              family = binomial(link = "probit"), data =.)), 
tidied = model %>% map(~ tidy(., conf.int = TRUE)),
glance = model %>% map(~ glance(.)))
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken


#############################
## Model M2 ##

## Create a wide data frame of the coefficients, standard errors, p-Values, and confidence intervals ##

model_m2_params  <- model_m2 %>%
  unnest(tidied) %>%
  dplyr::select(m, term, estimate, std.error, p.value, conf.low, conf.high) %>%
  gather(key, value, estimate, std.error, p.value, conf.low, conf.high)

data_m1 <- all_imputations %>% filter(m=="1") %>% unnest()
m1 <- feglm(aut_ep_onset ~  poly(v2xca_academ_draw_lag, degree = 2) + 
                   e_gdppc_draw_lag + e_gdppc_growth_roll5_draw_lag +  e_pop_draw_lag +
                   regionpol_EDI_draw_lag + v2x_jucon_draw_lag + v2xlg_legcon_draw_lag +
                   year_1900 + year_sq | region_factor,  cluster = ~country_text_id, 
                 family = binomial(link = "probit"), data =data_m1)
model_degree_freedom <- degrees_freedom(m1, type = "resid")

rm(m1, data_m1)

model_m2_params_sum <- model_m2_params %>%
  filter(key == "estimate" | key == "std.error") %>%
  pivot_wider(names_from = key, values_from = value) %>%
  group_by(term) %>%
  summarize(Estimate = mean(estimate),
            se.within = mean(std.error), 
            se.between = var(estimate), 
            se.est = sqrt(se.within^2 + se.between*(1 + (1/length(model_m2$model))))) %>%
  mutate(model = "Model 2 - Uncertainty in main explanatory variable") %>%
  mutate(statistic = Estimate / se.est,
         conf.low = Estimate + se.est * qt(0.025, model_degree_freedom),
         conf.high = Estimate + se.est * qt(0.975, model_degree_freedom),
         p.value = 2 * pt(abs(statistic), model_degree_freedom, lower.tail = FALSE))


#############################
## Model M3 ##

model_m3_params  <- model_m3 %>%
  unnest(tidied) %>%
  dplyr::select(m, term, estimate, std.error, p.value, conf.low, conf.high) %>%
  gather(key, value, estimate, std.error, p.value, conf.low, conf.high)


model_m3_params_sum <- model_m3_params %>%
  filter(key == "estimate" | key == "std.error") %>%
  pivot_wider(names_from = key, values_from = value) %>%
  group_by(term) %>%
  summarize(Estimate = mean(estimate),
            se.within = mean(std.error), 
            se.between = var(estimate), 
            se.est = sqrt(se.within^2 + se.between*(1 + (1/length(model_m2$model))))) %>%
  mutate(model = "Model 3 - Uncertainty in all explanatory variables") %>%
  mutate(statistic = Estimate / se.est,
         conf.low = Estimate + se.est * qt(0.025, model_degree_freedom),
         conf.high = Estimate + se.est * qt(0.975, model_degree_freedom),
         p.value = 2 * pt(abs(statistic), model_degree_freedom, lower.tail = FALSE))

###############################################################################
## Model 1 without uncertainty ##

df_model_1 <- basic_df %>%
  group_by(e_regionpol, year) %>%
  mutate(regionpol_EDI = mean(v2x_polyarchy, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(country_text_id) %>%
  mutate(e_gdppc_growth = ((e_gdppc - dplyr::lag(e_gdppc))/dplyr::lag(e_gdppc))*100) %>%
  mutate(e_gdppc_growth_roll5 = zoo::rollmean(e_gdppc_growth, k = 5, fill = NA)) %>%
  ungroup() %>%
  mutate(region_factor = as.factor(e_regionpol_6C),
         region_factor = relevel(region_factor, ref=1), 
         year_1900 =  year -1900,
         year_sq =year_1900^2) %>%
  group_by(country_text_id) %>%
  mutate(v2xca_academ_lag = lag(v2xca_academ), 
         e_gdppc_lag = lag(e_gdppc), 
         e_pop_lag = lag(e_pop), 
         e_gdppc_growth_roll5_lag = lag(e_gdppc_growth_roll5), 
         regionpol_EDI_lag = lag(regionpol_EDI), 
         v2x_jucon_lag = lag(v2x_jucon), 
         v2xlg_legcon_lag = lag(v2xlg_legcon)
  ) %>%
  drop_na(v2xca_academ_lag, e_gdppc_lag, e_gdppc_growth_roll5_lag, regionpol_EDI_lag, 
          e_pop_lag,region_factor, v2x_jucon_lag, v2xlg_legcon_lag, year_1900, country_text_id) %>%
  group_by(country_text_id) %>%
  mutate(aut_ep_onset = ifelse(aut_ep==1 & lag(aut_ep)== 1, NA, aut_ep)) %>%
  dplyr::select(country_text_id, year, aut_ep, aut_ep_onset, everything()) %>%
  drop_na(aut_ep_onset)

m1 <- feglm(aut_ep_onset ~  poly(v2xca_academ_lag, degree = 2) + 
              e_gdppc_lag + e_gdppc_growth_roll5_lag +  e_pop_lag +
              regionpol_EDI_lag + v2x_jucon_lag + v2xlg_legcon_lag +
              year_1900 + year_sq | region_factor,  cluster = ~country_text_id, 
            family = binomial(link = "probit"), data =df_model_1)

## dataframe for non imputed computations ##
m1_estimate <- m1 %>% tidy(., conf.int = TRUE)

m1_estimate$model <- "Model 1 - Without Uncertainty"

model_m2_params_sum <- model_m2_params_sum %>%
  dplyr::rename(estimate = Estimate, 
                std.error = se.est) %>%
  dplyr::select(term, estimate, std.error, statistic, p.value, conf.low, conf.high, model)

model_m3_params_sum <- model_m3_params_sum %>%
  dplyr::rename(estimate = Estimate, 
                std.error = se.est) %>%
  dplyr::select(term, estimate, std.error, statistic, p.value, conf.low, conf.high, model)
  
  
#### Building the Coefficient Plot #####

data_figure <- rbind(m1_estimate, model_m2_params_sum, model_m3_params_sum) 

data_figure <- data_figure %>%
  mutate(term = case_when(term == "poly(v2xca_academ_lag, degree = 2)1" ~ "Academic Freedom Index", 
                          term == "poly(v2xca_academ_lag, degree = 2)2" ~ "Academic Freedom Index sqr.",
                          term == "e_gdppc_lag" ~ "GDP pc",
                          term == "e_gdppc_growth_roll5_lag" ~ "GDP growth",
                          term == "e_pop_lag" ~ "Population",
                          term == "regionpol_EDI_lag" ~ "Regional democracy level",
                          term == "v2x_jucon_lag" ~ "Judicial constraints on executive",
                          term == "v2xlg_legcon_lag" ~ "Legislative constraints on executive",
                          term == "year_1900" ~ "Year",
                          term == "year_sq" ~ "Year sqr.",
                          term == "poly(v2xca_academ_draw_lag, degree = 2)1" ~ "Academic Freedom Index",
                          term == "poly(v2xca_academ_draw_lag, degree = 2)2" ~ "Academic Freedom Index sqr.",
                          term == "year_1900" ~ "Year",
                          term == "year_1900" ~ "Year",
                          term == "year_1900" ~ "Year",
                          term == "e_gdppc_draw_lag" ~ "GDP pc",
                          term == "e_gdppc_growth_roll5_draw_lag" ~ "GDP growth",
                          term == "e_pop_draw_lag" ~ "Population",
                          term == "regionpol_EDI_draw_lag" ~ "Regional democracy level",
                          term == "v2x_jucon_draw_lag" ~ "Judicial constraints on executive",
                          term == "v2xlg_legcon_draw_lag" ~ "Legislative constraints on executive"))


color_pal <- c("#2980b9", "#d35400", "#1B9E77") 

f2 <- ggplot(data = subset(data_figure, str_detect(term, "Academic Freedom")), aes(y = term, color = model, group = model)) +
  geom_vline(xintercept = 0, colour = gray(1/2), lty = 2) +
  geom_linerange(aes(xmin = conf.low,
                     xmax = conf.high), size = 1, position = position_dodge(width=.5)) +
  geom_point(aes(x = estimate), size = 3, color = "black", shape = 19, position = position_dodge(width=.5)) +
  geom_point(aes(x = estimate), size = 1, color = "white", shape = 19, position = position_dodge(width=.5)) +
  labs(title="",x="Estimated Coefficients", y = "", color = "") +
  theme_bw() +
  scale_color_manual(values = color_pal) +
  theme(legend.position="bottom") +
  guides(color=guide_legend(nrow=3))

################################################################################
################################################################################

#### Marginaleffects ####

m2 <- list()

## Model 2 ##
for(i in 1:1000) {
  m2[[i]] <- plot_predictions(model_m2$model[[i]], condition = c("v2xca_academ_draw_lag"), draw=FALSE)
}

class(m2)

m2_df <- bind_rows(unclass(m2), .id = "m") %>%
  group_by(m) %>%
  nest()

## Create a wide data frame of the coefficients, standard errors, p-Values, and confidence intervals ##

m2_df  <- m2_df %>%
  unnest(data) %>%
  dplyr::select(m, rowid, estimate, p.value, s.value, conf.low, conf.high, v2xca_academ_draw_lag) 

model_m2_pred <- m2_df %>%
  group_by(rowid) %>%
  summarize(v2xca_academ_draw_lag =  mean(v2xca_academ_draw_lag), 
            Estimate = mean(estimate),
            conf.low = mean(conf.low), 
            conf.high = mean(conf.high)) %>%
  mutate(model = "Model 2 - Uncertainty in main explanatory variable") 

## Model 3## 
m3 <- list()

## Model 3 ##
for(i in 1:1000) {
  m3[[i]] <- plot_predictions(model_m3$model[[i]], condition = c("v2xca_academ_draw_lag"), draw=FALSE)
}

class(m3)

m3_df <- bind_rows(unclass(m3), .id = "m") %>%
  group_by(m) %>%
  nest()

## Create a wide data frame of the coefficients, standard errors, p-Values, and confidence intervals ##

m3_df  <- m3_df %>%
  unnest(data) %>%
  dplyr::select(m, rowid, estimate, p.value, s.value, conf.low, conf.high, v2xca_academ_draw_lag) 

model_m3_pred <- m3_df %>%
  group_by(rowid) %>%
  summarize(v2xca_academ_draw_lag =  mean(v2xca_academ_draw_lag), 
            Estimate = mean(estimate),
            conf.low = mean(conf.low), 
            conf.high = mean(conf.high)) %>%
  mutate(model = "Model 3 - Uncertainty in all explanatory variables") 


## Plot results for M2 ##
color_pal <- c("#2980b9", "#d35400", "#1B9E77") 


fig_m2_pred <- ggplot(model_m2_pred, aes(x = v2xca_academ_draw_lag, y = Estimate)) +
  geom_line(size = 1.1, color =  "#d35400") +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),alpha = 0.3, fill =  "#d35400") +
  labs(y = "P(autocratization onset)", 
       x = "Academic Freedom Index Draws") +
  ylim(0, 0.05) +
  theme_bw()

## Plot result for M3 ##

fig_m3_pred <- ggplot(model_m3_pred, aes(x = v2xca_academ_draw_lag, y = Estimate)) +
  geom_line(size = 1.1, color = "#1B9E77") +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),alpha = 0.3, fill = "#1B9E77") +
  labs(y = "P(autocratization onset)", 
       x = "Academic Freedom Index Draws") +
  ylim(0, 0.05) +
  theme_bw()

## Plot results for M1 ##

m1_pred <- plot_predictions(m1, condition = c("v2xca_academ_lag"), draw=FALSE)

fig_m1_pred <- ggplot(m1_pred, aes(x = v2xca_academ_lag, y = estimate)) +
  geom_line(size = 1.1, color = "#2980b9") +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high),alpha = 0.3, fill = "#2980b9") +
  labs(y = "P(autocratization onset)", 
       x = "Academic Freedom Index") +
  theme_bw()


pred_fig <- ggpubr::ggarrange(fig_m1_pred, fig_m2_pred, fig_m3_pred,
                            nrow = 1, ncol = 3, legend = "bottom")


f3 <- ggpubr::ggarrange(f2, pred_fig, 
                  nrow = 2, ncol = 1, legend = "bottom", labels="AUTO")


ggsave(plot = f3, "outputs_original_data/Figure_8.pdf", width = 25,
       height = 15,
       units = c("cm"), dpi = 1000)
ggsave(plot = f3, "outputs_original_data/Figure_8.png", width = 25,
       height = 15,
       units = c("cm"), dpi = 1000)




